Besides checking the results of DE analysis, also save the sets of DE genes out, to prepare for next step of gene set enrichment analysis.
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
path = "../../data/MorPhiC/June-2024/JAX_RNAseq_Neuroectoderm/"
dat_path = paste0(path, "processed/release110-gencode44/Tables")From searching online, my guess is the model_organ content inside the bracket should be
dorsal forebrain patterning
instead of
dorsal forebrain pattering
But leave it as it is for now.
## [1] 174 32
## biomaterial_id lib_biomaterial_id differentiated_biomaterial_id
## 1 MOK20002-WT GT23-05635 CBO-MOK20002-WT-Day7
## 2 MOK20002-WT GT23-05625 CBO-MOK20002-WT-Day6
## differentiated_biomaterial_description differentiation_protocol
## 1 KOLF2.2 derived cortical brain organoid JAXPD003
## 2 KOLF2.2 derived cortical brain organoid JAXPD003
## differentiated_timepoint_value differentiated_timepoint_unit
## 1 7 days
## 2 6 days
## differentiated_terminally_differentiated
## 1 No
## 2 No
## model_organ ko_strategy ko_gene
## 1 Cortical brain (dorsal forebrain pattering) WT WT
## 2 Cortical brain (dorsal forebrain pattering) WT WT
## library_preparation_protocol dissociation_protocol fragment_size input_amount
## 1 JAXPL001 N.A 410 300
## 2 JAXPL001 N.A 425 300
## input_unit final_yield final_yield_unit lib_concentration
## 1 ngs 249.6 ngs 51.8
## 2 ngs 170.0 ngs 33.8
## lib_concentration_unit PCR_cycle PCR_cycle_sample_index
## 1 nM 10 NA
## 2 nM 10 NA
## file_id sequencing_protocol
## 1 RFX_WT_Day7_GT23-05635_GCACGGAC-TGCGAGAC_S55_L002 JAXPS001
## 2 RFX_WT_Day6_GT23-05625_GACCTGAA-CTCACCAA_S97_L002 JAXPS001
## run_id biomaterial_description
## 1 20230424_23-scbct-028-run3 KOLF2.2
## 2 20230424_23-scbct-028-run3 KOLF2.2
## derived_cell_line_accession clone_id protocol_id zygosity type model_system
## 1 KOLF2.2J WT N.A N.A iPSC CBO
## 2 KOLF2.2J WT N.A N.A iPSC CBO
## [1] "biomaterial_id"
## [2] "lib_biomaterial_id"
## [3] "differentiated_biomaterial_id"
## [4] "differentiated_biomaterial_description"
## [5] "differentiation_protocol"
## [6] "differentiated_timepoint_value"
## [7] "differentiated_timepoint_unit"
## [8] "differentiated_terminally_differentiated"
## [9] "model_organ"
## [10] "ko_strategy"
## [11] "ko_gene"
## [12] "library_preparation_protocol"
## [13] "dissociation_protocol"
## [14] "fragment_size"
## [15] "input_amount"
## [16] "input_unit"
## [17] "final_yield"
## [18] "final_yield_unit"
## [19] "lib_concentration"
## [20] "lib_concentration_unit"
## [21] "PCR_cycle"
## [22] "PCR_cycle_sample_index"
## [23] "file_id"
## [24] "sequencing_protocol"
## [25] "run_id"
## [26] "biomaterial_description"
## [27] "derived_cell_line_accession"
## [28] "clone_id"
## [29] "protocol_id"
## [30] "zygosity"
## [31] "type"
## [32] "model_system"
##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## Cortical brain (dorsal forebrain pattering) 36 36 9 27 9 36 21
##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## 20230228_23-scbct-008 0 0 0 0 9 0 1
## 20230424_23-scbct-028-run3 0 0 0 0 0 27 3
## 20230508_23-scbct-039-run2 27 0 0 0 0 0 3
## 20230522_23-scbct-052 0 27 0 0 0 0 3
## 20230824_23-scbct-092 0 0 9 0 0 0 1
## 20240131_23-robson-020 0 0 0 27 0 0 3
## 20240307_24-robson-003 0 0 0 0 0 9 1
## 20240307_24-robson-004 0 9 0 0 0 0 3
## 20240307_24-robson-006 9 0 0 0 0 0 3
##
## FEZF2 LHX5 LHX9 OTX1 PAX6 RFX4 WT
## CE 12 12 3 9 3 12 0
## KO 12 12 3 9 3 12 0
## PTC 12 12 3 9 3 12 0
## WT 0 0 0 0 0 0 21
For the row with column differentiated_biomaterial_id==“CBO-MOK20004-WT-Day3”, it should have value 3 instead of the original 4 in the column differentiated_timepoint_value.
## [1] "Before correcting the error in timepoint:"
## [1] "Table between ko_gene and differentiated_timepoint_value:"
##
## 3 4 5 6 7 8 9 14
## FEZF2 0 0 9 9 9 9 0 0
## LHX5 9 9 9 9 0 0 0 0
## LHX9 0 0 0 0 0 0 0 9
## OTX1 0 0 0 0 9 9 9 0
## PAX6 0 0 0 0 9 0 0 0
## RFX4 0 0 0 9 9 9 9 0
## WT 0 2 2 5 4 5 2 1
meta[which(meta$differentiated_biomaterial_id=="CBO-MOK20004-WT-Day3"),
"differentiated_timepoint_value"] = 3
print("After correcting the error in timepoint:")## [1] "After correcting the error in timepoint:"
## [1] "Table between ko_gene and differentiated_timepoint_value:"
##
## 3 4 5 6 7 8 9 14
## FEZF2 0 0 9 9 9 9 0 0
## LHX5 9 9 9 9 0 0 0 0
## LHX9 0 0 0 0 0 0 0 9
## OTX1 0 0 0 0 9 9 9 0
## PAX6 0 0 0 0 9 0 0 0
## RFX4 0 0 0 9 9 9 9 0
## WT 1 1 2 5 4 5 2 1
## [1] 38592 175
## Name RFX_CE_F5_Day8_GT23-05641_GTCGGAGC-TTATAACC_S73_L002
## 1 ENSG00000268674 0
## 2 ENSG00000271254 1521
## LHX5_CE_A1_Day3_GT23-07300_AAGTCCAA-TACTCATA_S163_L004
## 1 0
## 2 1270
## LHX5_KO_E4_Day5_GT23-07318_AACGTTCC-AGTACTCC_S166_L004
## 1 0
## 2 746
stopifnot(setequal(names(cts)[-1], meta$file_id))
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 174
## model_s
## CBO
## 174
##
## Cortical brain (dorsal forebrain pattering)
## CBO 174
get_q75 <- function(x){quantile(x, probs = 0.75)}
# given that there is only one level for model_system
# the result from apply(cts_dat, 1, tapply, model_s, min) is no longer a matrix
# but a vector
# do not do transpose to the results from apply(cts_dat, 1, tapply, model_s, min)
gene_info = data.frame(Name = cts$Name,
apply(cts_dat, 1, tapply, model_s, min),
apply(cts_dat, 1, tapply, model_s, median),
apply(cts_dat, 1, tapply, model_s, get_q75),
apply(cts_dat, 1, tapply, model_s, max))
dim(gene_info)## [1] 38592 5
## Name apply.cts_dat..1..tapply..model_s..min.
## ENSG00000268674 ENSG00000268674 0
## ENSG00000271254 ENSG00000271254 426
## apply.cts_dat..1..tapply..model_s..median.
## ENSG00000268674 0
## ENSG00000271254 1318
## apply.cts_dat..1..tapply..model_s..get_q75.
## ENSG00000268674 0.00
## ENSG00000271254 1701.75
## apply.cts_dat..1..tapply..model_s..max.
## ENSG00000268674 3
## ENSG00000271254 3117
##
## TRUE
## 38592
names(gene_info)[2:5] = paste0(rep(c("CBO_"), times=4),
rep(c("min", "median", "q75", "max"), each=1))
dim(gene_info)## [1] 38592 5
## Name CBO_min CBO_median CBO_q75 CBO_max
## ENSG00000268674 ENSG00000268674 0 0 0.00 3
## ENSG00000271254 ENSG00000271254 426 1318 1701.75 3117
## CBO_min CBO_median CBO_q75 CBO_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 2
## Median : 0.0 Median : 3.0 Median : 5.8 Median : 22
## Mean : 213.8 Mean : 699.1 Mean : 931.4 Mean : 1829
## 3rd Qu.: 89.0 3rd Qu.: 332.1 3rd Qu.: 474.6 3rd Qu.: 1003
## Max. :88969.0 Max. :489181.5 Max. :586205.0 Max. :1301990
##
## FALSE TRUE
## 14390 24202
## [1] 24202 174
## [1] 62754 8
## geneId chr strand start end ensembl_gene_id
## 1: ENSG00000000003.16 chrX - 100627108 100639991 ENSG00000000003
## 2: ENSG00000000005.6 chrX + 100584936 100599885 ENSG00000000005
## hgnc_symbol description
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
# all genes are contained in the annotation file
table(gene_info$Name %in% gene_anno$ensembl_gene_id)##
## TRUE
## 24202
gene_info = merge(gene_anno, gene_info, by.x="ensembl_gene_id", by.y = "Name",
all.x = FALSE, all.y = TRUE)
dim(gene_info)## [1] 24202 12
## ensembl_gene_id geneId chr strand start end
## 1: ENSG00000000003 ENSG00000000003.16 chrX - 100627108 100639991
## 2: ENSG00000000005 ENSG00000000005.6 chrX + 100584936 100599885
## hgnc_symbol description CBO_min
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] 744
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] 0
## CBO_median CBO_q75 CBO_max
## 1: 2644.0 3235.00 5449
## 2: 9.5 20.75 97
## Empty data.table (0 rows and 12 cols): ensembl_gene_id,geneId,chr,strand,start,end...
## CBO_min hgnc_symbol
## 1: 22057 ACTB
## 2: 23713 HNRNPA1
## 3: 66180 EEF1A1
## 4: 23858 TUBB
## 5: 88639 MT-CO1
## 6: 29947 MT-ND4
## 7: 39231 MT-CO3
## 8: 88969 MALAT1
##
## FALSE TRUE
## 19181 4959
##
## FALSE TRUE
## 24140 62
wEns = which(gene_info$gene_symbol == "" | is.na(gene_info$gene_symbol))
gene_info$gene_symbol[wEns] = gene_info$ensembl_gene_id[wEns]
table(gene_info$gene_symbol == "")##
## FALSE
## 24202
##
## FALSE
## 24202
For this dataset, there is only one model system, and for each day group and each knockout gene, there are three knockout strategies.
Do it separately by timepoint (day).
day_group_strings = c("3_4", "5", "6", "7", "8", "9")
# define a function to extract the model_daygroup_gene part of the result filenames
# under the assumption that the desired part is the one before the
# second-to-last underscore
extract_model_day_group <- function(x) {
parts <- unlist(str_split(x, "_"))
return(paste(parts[1:(length(parts) - 2)], collapse = "_"))
}
for (day_group in day_group_strings){
res = fread(paste0("results/2024_06_JAX_RNAseq_Neuroectoderm/",
"2024_06_JAX_RNAseq_Neuroectoderm_CBO_",
day_group, "_DEseq2.tsv"), data.table = FALSE)
print(res[1:2,1:7])
items = names(res)[grep("_padj", names(res))]
items = unique(sapply(items, extract_model_day_group))
items
fc0 = log2(1.5)
p0 = 0.05
gene_symbols = str_extract(items, "[a-zA-Z0-9]+$")
gene_symbols
table(gene_symbols %in% gene_info$gene_symbol)
df = data.frame(KO = items, gene_symbol = gene_symbols,
CE_nDE = NA, KO_nDE = NA, PTC_nDE = NA,
CE_padj = NA, CE_log2FoldChange = NA,
KO_padj = NA, KO_log2FoldChange = NA,
PTC_padj = NA, PTC_log2FoldChange = NA)
df$Model = str_extract(df$KO, "^[a-zA-Z0-9]+")
gene_ids = gene_info$ensembl_gene_id[match(gene_symbols, gene_info$gene_symbol)]
for(k in 1:length(items)){
it_k = items[k]
gene_id = gene_ids[k]
w_gene = which(res$gene_ID == gene_id)
stopifnot(length(w_gene) == 1)
for(ko1 in c("CE", "KO", "PTC")){
fc = paste0(it_k, "_", ko1, "_log2FoldChange")
padj = paste0(it_k, "_", ko1, "_padj")
col_name = paste0(ko1, "_nDE")
df[k,col_name] = sum(abs(res[[fc]]) > fc0 & res[[padj]] < p0, na.rm = TRUE)
col_name = paste0(ko1, "_log2FoldChange")
df[k,col_name] = res[[fc]][w_gene]
col_name = paste0(ko1, "_padj")
df[k,col_name] = res[[padj]][w_gene]
}
}
print(df)
p1 <- ggplot(df, aes(x=(PTC_nDE), y=(CE_nDE),
label=gene_symbol)) +
geom_point() + scale_shape_manual(values = c(7, 16)) +
geom_text_repel() + ggtitle("# of DE genes") +
labs(x="PTC", y="CE") +
geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")
p2 <- ggplot(df, aes(x=(PTC_nDE), y=(KO_nDE),
label=gene_symbol)) +
geom_point() + scale_shape_manual(values = c(7, 16)) +
geom_text_repel() + ggtitle("# of DE genes") +
labs(x="PTC", y="KO") +
geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")
p3 <- ggplot(df, aes(x=CE_log2FoldChange, y=-log10(CE_padj),
label=gene_symbol)) +
geom_point(size=2) + scale_shape_manual(values = c(7, 16)) +
geom_text_repel() + ggtitle("DE results of the KO genes, CE") +
labs(x="log2 fold change", y="-log10(p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
p4 <- ggplot(df, aes(x=KO_log2FoldChange, y=-log10(KO_padj),
label=gene_symbol)) +
geom_point(size=2) + scale_shape_manual(values = c(7, 16)) +
geom_text_repel() + ggtitle("DE results of the KO genes, KO") +
labs(x="log2 fold change", y="-log10(p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
p5 <- ggplot(df, aes(x=PTC_log2FoldChange, y=-log10(PTC_padj),
label=gene_symbol)) +
geom_point(size=2) + scale_shape_manual(values = c(7, 16)) +
geom_text_repel() + ggtitle("DE results of the KO genes, PTC") +
labs(x="log2 fold change", y="-log10(p-value)") +
geom_hline(yintercept = 2, color = "grey", linetype = "dashed") +
geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
linetype = "dashed")
g_combined <- ggarrange(p1, p2, p3, p4, p5, ncol = 2, nrow = 3)
print(annotate_figure(g_combined,
top = text_grob(paste0("CBO, day group ", day_group),
face = "bold", size = 16)))
}## gene_ID CBO_3_4_LHX5_CE_baseMean CBO_3_4_LHX5_CE_log2FoldChange
## 1 ENSG00000271254 1330.00 0.0349
## 2 ENSG00000276345 2.54 0.9730
## CBO_3_4_LHX5_CE_lfcSE CBO_3_4_LHX5_CE_stat CBO_3_4_LHX5_CE_pvalue
## 1 0.171 0.204 0.838
## 2 2.370 0.410 0.682
## CBO_3_4_LHX5_CE_padj
## 1 1
## 2 1
## KO gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj CE_log2FoldChange
## 1 CBO_3_4_LHX5 LHX5 1 4 5 1 0.366
## KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange Model
## 1 1 -1.16 1 -2.08 CBO
## gene_ID CBO_5_LHX5_CE_baseMean CBO_5_LHX5_CE_log2FoldChange
## 1 ENSG00000271254 1180.00 0.216
## 2 ENSG00000276345 1.45 2.600
## CBO_5_LHX5_CE_lfcSE CBO_5_LHX5_CE_stat CBO_5_LHX5_CE_pvalue
## 1 0.123 1.760 0.0788
## 2 3.410 0.761 0.4470
## CBO_5_LHX5_CE_padj
## 1 0.698
## 2 NA
## KO gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj CE_log2FoldChange
## 1 CBO_5_LHX5 LHX5 2 172 1 0.96 0.101
## 2 CBO_5_FEZF2 FEZF2 182 9 22 NA -4.640
## KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange Model
## 1 0.978 -0.0183 0.0949 -1.06 CBO
## 2 NA -5.6900 NA -1.05 CBO
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
## gene_ID CBO_6_LHX5_CE_baseMean CBO_6_LHX5_CE_log2FoldChange
## 1 ENSG00000271254 866.00 0.244
## 2 ENSG00000276345 1.19 12.300
## CBO_6_LHX5_CE_lfcSE CBO_6_LHX5_CE_stat CBO_6_LHX5_CE_pvalue
## 1 0.144 1.69 0.09080
## 2 3.210 3.83 0.00013
## CBO_6_LHX5_CE_padj
## 1 0.8980
## 2 0.0286
## KO gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj CE_log2FoldChange
## 1 CBO_6_LHX5 LHX5 100 22 32 0.545 -1.290
## 2 CBO_6_FEZF2 FEZF2 37 32 41 NA -2.130
## 3 CBO_6_RFX4 RFX4 45 48 41 1.000 -0.636
## KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange Model
## 1 1.00000 -0.427 1 -0.3620 CBO
## 2 0.00326 -6.270 1 -0.0372 CBO
## 3 0.95800 0.558 1 -0.8610 CBO
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_text_repel()`).
## gene_ID CBO_7_FEZF2_CE_baseMean CBO_7_FEZF2_CE_log2FoldChange
## 1 ENSG00000271254 1580 -0.1590
## 2 ENSG00000277196 1300 -0.0632
## CBO_7_FEZF2_CE_lfcSE CBO_7_FEZF2_CE_stat CBO_7_FEZF2_CE_pvalue
## 1 0.179 -0.892 0.372
## 2 0.281 -0.225 0.822
## CBO_7_FEZF2_CE_padj
## 1 0.686
## 2 0.935
## KO gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj CE_log2FoldChange
## 1 CBO_7_FEZF2 FEZF2 1 5 2 NA -0.296
## 2 CBO_7_PAX6 PAX6 1 1 1 1 -0.567
## 3 CBO_7_RFX4 RFX4 3 0 0 1 -1.290
## 4 CBO_7_OTX1 OTX1 4 1 5 1 0.606
## KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange Model
## 1 2.63e-06 -4.010 1 1.6100 CBO
## 2 1.00e+00 1.440 1 -1.2700 CBO
## 3 1.00e+00 -0.839 1 -0.9720 CBO
## 4 1.00e+00 0.468 1 -0.0452 CBO
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_text_repel()`).
## gene_ID CBO_8_RFX4_CE_baseMean CBO_8_RFX4_CE_log2FoldChange
## 1 ENSG00000271254 1320 0.143
## 2 ENSG00000277196 370 -0.494
## CBO_8_RFX4_CE_lfcSE CBO_8_RFX4_CE_stat CBO_8_RFX4_CE_pvalue
## 1 0.158 0.908 0.3640
## 2 0.266 -1.850 0.0636
## CBO_8_RFX4_CE_padj
## 1 1
## 2 1
## KO gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj CE_log2FoldChange
## 1 CBO_8_RFX4 RFX4 74 70 72 1.000 -1.040
## 2 CBO_8_OTX1 OTX1 37 84 47 0.267 1.300
## 3 CBO_8_FEZF2 FEZF2 37 71 64 1.000 -0.341
## KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange Model
## 1 1.00e+00 -0.419 1.000 -1.360 CBO
## 2 6.69e-01 0.719 1.000 0.328 CBO
## 3 1.12e-25 -3.710 0.102 0.808 CBO
## gene_ID CBO_9_RFX4_CE_baseMean CBO_9_RFX4_CE_log2FoldChange
## 1 ENSG00000271254 1470 0.291
## 2 ENSG00000277196 163 -0.031
## CBO_9_RFX4_CE_lfcSE CBO_9_RFX4_CE_stat CBO_9_RFX4_CE_pvalue
## 1 0.163 1.780 0.0745
## 2 0.232 -0.134 0.8940
## CBO_9_RFX4_CE_padj
## 1 1
## 2 1
## KO gene_symbol CE_nDE KO_nDE PTC_nDE CE_padj CE_log2FoldChange
## 1 CBO_9_RFX4 RFX4 59 79 54 1 -1.150
## 2 CBO_9_OTX1 OTX1 52 70 61 1 0.614
## KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange Model
## 1 0.273 -2.660 1 -1.22 CBO
## 2 1.000 0.447 1 0.38 CBO
In some (day group, knockout gene, knockout strategy) combinations, there are too many DE genes and they cannot all be labeled in the valcano plots.
for (day_group in day_group_strings){
# create a list to store DE gene set for the gene set enrichment analysis
de_gene_sets = list()
print(day_group)
res = fread(paste0("results/2024_06_JAX_RNAseq_Neuroectoderm/",
"2024_06_JAX_RNAseq_Neuroectoderm_CBO_",
day_group, "_DEseq2.tsv"), data.table = FALSE)
print(res[1:2,1:7])
items = names(res)[grep("_padj", names(res))]
items
for(it1 in items){
id1 = gsub("_padj", "", it1)
res_k = res[,c(1, grep(id1, names(res)))]
names(res_k) = gsub(paste0(id1, "_"), "", names(res_k))
ww_up = which((res_k$log2FoldChange > log2(1.5)) & (res_k$padj < 0.05))
ww_down = which((res_k$log2FoldChange < -log2(1.5)) & (res_k$padj < 0.05))
de_gene_sets[[paste0(id1, "_up")]] = res_k[ww_up,]$gene_ID
de_gene_sets[[paste0(id1, "_down")]] = res_k[ww_down,]$gene_ID
res_k$DE = "No"
res_k$DE[ww_up] <- "Up"
res_k$DE[ww_down] <- "Down"
res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
col2use = c("blue", "grey", "red")
names(col2use) = c("Down", "No", "Up")
res_k$delabel = NA
res_k$gene_symbol = gene_info$gene_symbol[match(res_k$gene_ID,
gene_info$ensembl_gene_id)]
w_de = which(res_k$DE != "No")
res_k$delabel[w_de] = res_k$gene_symbol[w_de]
print(table(res_k$DE))
print("--------------------------------------------------")
print(paste0("CBO, day group ", day_group))
print("--------------------------------------------------")
g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj),
col=DE, label=delabel)) +
geom_point() + ggtitle(id1) +
geom_text_repel(point.padding = 0.5,
min.segment.length = 0,
size = 3, # Adjust text size
box.padding = 0.5,
max.overlaps = 20,
show.legend = FALSE) +
scale_color_manual(values=col2use)
print(g2)
}
# save out the DE gene lists for gene set enrichment analysis
output_dir = "results/2024_06_JAX_RNAseq_Neuroectoderm_DE_gene_lists"
if (!file.exists(output_dir)){
dir.create(output_dir)
}
saveRDS(de_gene_sets,
sprintf("%s/2024_06_JAX_RNAseq_Neuroectoderm_CBO_%s_DE_gene_list.rds",
output_dir, day_group))
}## [1] "3_4"
## gene_ID CBO_3_4_LHX5_CE_baseMean CBO_3_4_LHX5_CE_log2FoldChange
## 1 ENSG00000271254 1330.00 0.0349
## 2 ENSG00000276345 2.54 0.9730
## CBO_3_4_LHX5_CE_lfcSE CBO_3_4_LHX5_CE_stat CBO_3_4_LHX5_CE_pvalue
## 1 0.171 0.204 0.838
## 2 2.370 0.410 0.682
## CBO_3_4_LHX5_CE_padj
## 1 1
## 2 1
##
## No Up
## 23241 1
## [1] "--------------------------------------------------"
## [1] "CBO, day group 3_4"
## [1] "--------------------------------------------------"
## Warning: Removed 36 rows containing missing values (`geom_point()`).
## Warning: Removed 23241 rows containing missing values (`geom_text_repel()`).
##
## Down No
## 4 23238
## [1] "--------------------------------------------------"
## [1] "CBO, day group 3_4"
## [1] "--------------------------------------------------"
## Warning: Removed 36 rows containing missing values (`geom_point()`).
## Warning: Removed 23238 rows containing missing values (`geom_text_repel()`).
##
## Down No
## 5 23237
## [1] "--------------------------------------------------"
## [1] "CBO, day group 3_4"
## [1] "--------------------------------------------------"
## Warning: Removed 36 rows containing missing values (`geom_point()`).
## Warning: Removed 23237 rows containing missing values (`geom_text_repel()`).
## [1] "5"
## gene_ID CBO_5_LHX5_CE_baseMean CBO_5_LHX5_CE_log2FoldChange
## 1 ENSG00000271254 1180.00 0.216
## 2 ENSG00000276345 1.45 2.600
## CBO_5_LHX5_CE_lfcSE CBO_5_LHX5_CE_stat CBO_5_LHX5_CE_pvalue
## 1 0.123 1.760 0.0788
## 2 3.410 0.761 0.4470
## CBO_5_LHX5_CE_padj
## 1 0.698
## 2 NA
##
## Down No
## 2 23565
## [1] "--------------------------------------------------"
## [1] "CBO, day group 5"
## [1] "--------------------------------------------------"
## Warning: Removed 5034 rows containing missing values (`geom_point()`).
## Warning: Removed 23565 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 13 23395 159
## [1] "--------------------------------------------------"
## [1] "CBO, day group 5"
## [1] "--------------------------------------------------"
## Warning: Removed 12793 rows containing missing values (`geom_point()`).
## Warning: Removed 23395 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 157 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No
## 1 23566
## [1] "--------------------------------------------------"
## [1] "CBO, day group 5"
## [1] "--------------------------------------------------"
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Warning: Removed 23566 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 54 23385 128
## [1] "--------------------------------------------------"
## [1] "CBO, day group 5"
## [1] "--------------------------------------------------"
## Warning: Removed 10510 rows containing missing values (`geom_point()`).
## Warning: Removed 23385 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 161 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 2 23558 7
## [1] "--------------------------------------------------"
## [1] "CBO, day group 5"
## [1] "--------------------------------------------------"
## Warning: Removed 14164 rows containing missing values (`geom_point()`).
## Warning: Removed 23558 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 4 23545 18
## [1] "--------------------------------------------------"
## [1] "CBO, day group 5"
## [1] "--------------------------------------------------"
## Warning: Removed 12338 rows containing missing values (`geom_point()`).
## Warning: Removed 23545 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "6"
## gene_ID CBO_6_LHX5_CE_baseMean CBO_6_LHX5_CE_log2FoldChange
## 1 ENSG00000271254 866.00 0.244
## 2 ENSG00000276345 1.19 12.300
## CBO_6_LHX5_CE_lfcSE CBO_6_LHX5_CE_stat CBO_6_LHX5_CE_pvalue
## 1 0.144 1.69 0.09080
## 2 3.210 3.83 0.00013
## CBO_6_LHX5_CE_padj
## 1 0.8980
## 2 0.0286
##
## Down No Up
## 27 23811 73
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23811 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 67 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 9 23889 13
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23889 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 14 23879 18
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23879 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 15 23874 22
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 9735 rows containing missing values (`geom_point()`).
## Warning: Removed 23874 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 12 23879 20
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23879 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 16 23870 25
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23870 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 18 23866 27
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23866 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 9 23863 39
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23863 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 8 23870 33
## [1] "--------------------------------------------------"
## [1] "CBO, day group 6"
## [1] "--------------------------------------------------"
## Warning: Removed 23870 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "7"
## gene_ID CBO_7_FEZF2_CE_baseMean CBO_7_FEZF2_CE_log2FoldChange
## 1 ENSG00000271254 1580 -0.1590
## 2 ENSG00000277196 1300 -0.0632
## CBO_7_FEZF2_CE_lfcSE CBO_7_FEZF2_CE_stat CBO_7_FEZF2_CE_pvalue
## 1 0.179 -0.892 0.372
## 2 0.281 -0.225 0.822
## CBO_7_FEZF2_CE_padj
## 1 0.686
## 2 0.935
##
## Down No
## 1 23664
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 14223 rows containing missing values (`geom_point()`).
## Warning: Removed 23664 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 3 23660 2
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23660 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 1 23663 1
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23663 rows containing missing values (`geom_text_repel()`).
##
## No Up
## 23664 1
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23664 rows containing missing values (`geom_text_repel()`).
##
## No Up
## 23664 1
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 23664 rows containing missing values (`geom_text_repel()`).
##
## No Up
## 23664 1
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 23664 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 1 23662 2
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23662 rows containing missing values (`geom_text_repel()`).
##
## No
## 23665
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23665 rows containing missing values (`geom_text_repel()`).
##
## No
## 23665
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 23665 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 1 23661 3
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23661 rows containing missing values (`geom_text_repel()`).
##
## No Up
## 23664 1
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23664 rows containing missing values (`geom_text_repel()`).
##
## Down No Up
## 1 23660 4
## [1] "--------------------------------------------------"
## [1] "CBO, day group 7"
## [1] "--------------------------------------------------"
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 23660 rows containing missing values (`geom_text_repel()`).
## [1] "8"
## gene_ID CBO_8_RFX4_CE_baseMean CBO_8_RFX4_CE_log2FoldChange
## 1 ENSG00000271254 1320 0.143
## 2 ENSG00000277196 370 -0.494
## CBO_8_RFX4_CE_lfcSE CBO_8_RFX4_CE_stat CBO_8_RFX4_CE_pvalue
## 1 0.158 0.908 0.3640
## 2 0.266 -1.850 0.0636
## CBO_8_RFX4_CE_padj
## 1 1
## 2 1
##
## Down No Up
## 23 23194 51
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23194 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 19 23198 51
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23198 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 26 23196 46
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23196 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 39 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 14 23231 23
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23231 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 35 23184 49
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23184 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 48 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 23 23221 24
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23221 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 10 23231 27
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23231 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 48 23197 23
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23197 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 56 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 40 23204 24
## [1] "--------------------------------------------------"
## [1] "CBO, day group 8"
## [1] "--------------------------------------------------"
## Warning: Removed 23204 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "9"
## gene_ID CBO_9_RFX4_CE_baseMean CBO_9_RFX4_CE_log2FoldChange
## 1 ENSG00000271254 1470 0.291
## 2 ENSG00000277196 163 -0.031
## CBO_9_RFX4_CE_lfcSE CBO_9_RFX4_CE_stat CBO_9_RFX4_CE_pvalue
## 1 0.163 1.780 0.0745
## 2 0.232 -0.134 0.8940
## CBO_9_RFX4_CE_padj
## 1 1
## 2 1
##
## Down No Up
## 11 22864 48
## [1] "--------------------------------------------------"
## [1] "CBO, day group 9"
## [1] "--------------------------------------------------"
## Warning: Removed 22864 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 23 22844 56
## [1] "--------------------------------------------------"
## [1] "CBO, day group 9"
## [1] "--------------------------------------------------"
## Warning: Removed 22844 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 33 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 17 22869 37
## [1] "--------------------------------------------------"
## [1] "CBO, day group 9"
## [1] "--------------------------------------------------"
## Warning: Removed 22869 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 13 22871 39
## [1] "--------------------------------------------------"
## [1] "CBO, day group 9"
## [1] "--------------------------------------------------"
## Warning: Removed 22871 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 25 22853 45
## [1] "--------------------------------------------------"
## [1] "CBO, day group 9"
## [1] "--------------------------------------------------"
## Warning: Removed 22853 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
##
## Down No Up
## 21 22862 40
## [1] "--------------------------------------------------"
## [1] "CBO, day group 9"
## [1] "--------------------------------------------------"
## Warning: Removed 22862 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 31 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 7652511 408.7 12436565 664.2 NA 12436565 664.2
## Vcells 22994134 175.5 56893558 434.1 65536 56893558 434.1
## R version 4.2.3 (2023-03-15)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.2 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.14.8
## [5] dplyr_1.1.2 ggpointdensity_0.1.0
## [7] viridis_0.6.2 viridisLite_0.4.1
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.0.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-58.2
## [21] stringr_1.5.0 ggpubr_0.6.0
## [23] ggrepel_0.9.3 ggplot2_3.4.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 bit64_4.0.5 doParallel_1.0.17
## [4] httr_1.4.6 tools_4.2.3 backports_1.4.1
## [7] bslib_0.4.2 utf8_1.2.3 R6_2.5.1
## [10] DBI_1.1.3 colorspace_2.1-0 GetoptLong_1.0.5
## [13] withr_2.5.0 tidyselect_1.2.0 gridExtra_2.3
## [16] bit_4.0.5 compiler_4.2.3 cli_3.6.1
## [19] DelayedArray_0.24.0 labeling_0.4.2 sass_0.4.5
## [22] scales_1.2.1 digest_0.6.31 rmarkdown_2.21
## [25] XVector_0.38.0 pkgconfig_2.0.3 htmltools_0.5.5
## [28] fastmap_1.1.1 GlobalOptions_0.1.2 rlang_1.1.0
## [31] rstudioapi_0.14 RSQLite_2.3.1 farver_2.1.1
## [34] shape_1.4.6 jquerylib_0.1.4 generics_0.1.3
## [37] jsonlite_1.8.4 BiocParallel_1.32.6 car_3.1-2
## [40] RCurl_1.98-1.12 magrittr_2.0.3 GenomeInfoDbData_1.2.9
## [43] Matrix_1.6-4 Rcpp_1.0.10 munsell_0.5.0
## [46] fansi_1.0.4 abind_1.4-5 lifecycle_1.0.3
## [49] stringi_1.7.12 yaml_2.3.7 carData_3.0-5
## [52] zlibbioc_1.44.0 plyr_1.8.8 blob_1.2.4
## [55] parallel_4.2.3 crayon_1.5.2 lattice_0.20-45
## [58] cowplot_1.1.1 Biostrings_2.66.0 annotate_1.76.0
## [61] circlize_0.4.15 KEGGREST_1.38.0 locfit_1.5-9.8
## [64] knitr_1.44 pillar_1.9.0 rjson_0.2.21
## [67] ggsignif_0.6.4 geneplotter_1.76.0 codetools_0.2-19
## [70] XML_3.99-0.14 glue_1.6.2 evaluate_0.20
## [73] foreach_1.5.2 vctrs_0.6.2 png_0.1-8
## [76] cellranger_1.1.0 gtable_0.3.3 purrr_1.0.1
## [79] tidyr_1.3.0 clue_0.3-65 cachem_1.0.7
## [82] xfun_0.39 xtable_1.8-4 broom_1.0.4
## [85] rstatix_0.7.2 tibble_3.2.1 iterators_1.0.14
## [88] AnnotationDbi_1.60.2 memoise_2.0.1 cluster_2.1.4